library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:graphics':
## 
##     layout
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
library(tidyverse)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggrepel)
library(broom)
library(dplyr)

# Make the minimal ggplot2 theme the default for the plots in this document.
theme_set(theme_minimal())

# Read the TidyTuesday 2018-10-16 recent-graduates CSV directly from GitHub.
recent_grads <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-16/recent-grads.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Major = col_character(),
##   Major_category = col_character()
## )
## See spec(...) for full column specifications.
# This processing step was not written up front: we plotted the raw data
# first and then worked out which clean-up the plots needed.
#
# Major is converted to title case and then turned into a factor whose levels
# are ordered by Median, so plots list majors by salary. Rows are sorted from
# the highest Median to the lowest, so head(n) yields the top-n earners.
majors_processed <- recent_grads %>%
  mutate(Major = str_to_title(Major)) %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  arrange(desc(Median))

I’ll also be aggregating by category.

# Roll the per-major rows up to one row per Major_category.
# Rows with a missing Total are dropped before aggregating.
#   Men / Women / Total : sums over the majors in the category
#   MedianSalary        : Sample_size-weighted mean of the per-major Median
#   ShareWomen          : summed Women divided by summed Total
# The result is sorted by ShareWomen, highest first.
by_majoy_cateotry <- majors_processed %>%
  tidyr::drop_na(Total) %>%
  group_by(Major_category) %>%
  dplyr::summarize(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size),
    # Women and Total below refer to the category sums defined just above.
    ShareWomen = Women / Total
  ) %>%
  arrange(desc(ShareWomen))

Most common Majors

What major categories (e.g. engineering, psychology, business) were most common?

# Horizontal stacked bar chart of graduate counts per major category.
# Categories are ordered by Total, and the Men and Women columns are reshaped
# into Gender/Number pairs so that each bar is split by gender.
by_majoy_cateotry %>%
  mutate(Major_category = fct_reorder(Major_category, Total)) %>%
  gather(Gender, Number, Men, Women) %>%
  ggplot(aes(Major_category, Number, fill = Gender)) +
  geom_col() +
  scale_y_continuous(labels = comma_format()) +
  coord_flip() +
  labs(
    title = "What are the most common major categories?",
    x = "",
    y = "Total # of graduates"
  )

What categories of majors make more money than others?

# Horizontal boxplots of per-major Median salary, one box per major category.
# Categories are ordered by Median, and the y axis is extended to include 0.
# The fill legend is hidden because it would only repeat the axis labels.
recent_grads %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(
    aes(x = Major_category, y = Median, fill = Major_category)
  ) +
  geom_boxplot() +
  expand_limits(y = 0) +
  scale_y_continuous(labels = dollar_format()) +
  theme(legend.position = "none") +
  coord_flip()

What are the highest-earning majors?

# First 20 rows of majors_processed, with no sample-size filter.
# Each point is a major's Median salary, and the error bar spans P25th to
# P75th. Colour encodes the major category.
majors_processed %>%
  head(n = 20) %>%
  ggplot(
    aes(x = Major, y = Median, color = Major_category)
  ) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  scale_y_continuous(labels = dollar_format()) +
  expand_limits(y = 0) +
  coord_flip()

# Same chart as the unfiltered top-20 plot, but restricted to majors with a
# Sample_size of at least 100 before the first 20 rows are taken.
# dplyr::filter is namespaced to match the aggregation step earlier in the
# file and to stay safe against the masking reported when packages attach.
majors_processed %>%
  dplyr::filter(Sample_size >= 100) %>%
  head(20) %>%
  ggplot(aes(Major, Median, color = Major_category)) +
  scale_y_continuous(labels = dollar_format()) +
  geom_point() +
  # Error bars run from the 25th (P25th) to the 75th (P75th) percentile.
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  coord_flip() +
  expand_limits(y = 0) +
  labs(
    title = "What are the highest-earning majors?",
    subtitle = "Top 20 majors with at least 100 graduates surveyed. Bars represent the 25th to 75th percentile",
    x = "",
    y = "Median salary of graduates"
  )

Future work

Appendix

# Scatter plot of Median salary against Sample_size on a log10 x axis, with
# each point labelled by its Major. check_overlap is on, so a label that would
# overlap one already drawn is not plotted.
majors_processed %>%
  ggplot(aes(Sample_size, Median)) +
  geom_point() +
  # TRUE is spelled out: the shorthand T is an ordinary variable that can be
  # reassigned, whereas TRUE is a reserved word.
  geom_text(aes(label = Major), check_overlap = TRUE, vjust = 1, hjust = 1) +
  scale_x_log10()

# Stop knitting here: anything after this call is left out of the rendered
# document.
knitr::knit_exit()